# This file replicates the analysis in my March 2014 deadspin article
# Stephen Pettigrew
# pettigrew@fas.harvard.edu

load("all brackets.RData")

experts <- read.csv("experts.csv")
experts$espn.username <- NULL
experts$bracket.source <- NULL


# Clean up the data to account for the points earned in the final four
experts$elite8 <- rowSums(cbind((experts$south == "florida") * 80,
                                (experts$east == "uconn") * 80,
                                (experts$west == "wisconsin") * 80,
                                (experts$midwest == "kentucky") * 80))
experts$ff <- rowSums(cbind((experts$finalist1 == "uconn") * 160,
                            (experts$finalist2 == "kentucky") * 160))
experts$finals <- 320 * (experts$champ == "uconn")


brackets$elite8 <- rowSums(cbind((brackets$ff1 == "florida") * 80,
                                 (brackets$ff2 == "uconn") * 80,
                                 (brackets$ff3 == "wisconsin") * 80,
                                 (brackets$ff4 == "kentucky") * 80))
brackets$ff <- rowSums(cbind((brackets$finalist1 == "uconn") * 160,
                            (brackets$finalist2 == "kentucky") * 160))
brackets$finals <- 320 * (brackets$champ == "uconn")

experts$points <- rowSums(cbind(experts$round64,
                               experts$round32,
                               experts$sweet16,
                               experts$elite8,
                               experts$ff,
                               experts$finals))
brackets$points <- rowSums(cbind(brackets$round1,
                                brackets$round2,
                                brackets$sweet16,
                                brackets$elite8,
                                brackets$ff,
                                brackets$finals))



# Get the total number of games correctly predicted
experts$games <- rowSums(cbind(experts$round64 / 10,
                               experts$round32 / 20,
                               experts$sweet16 / 40,
                               experts$elite8 / 80,
                               experts$ff / 160,
                               experts$finals / 320))
brackets$games <- rowSums(cbind(brackets$round1 / 10,
                                brackets$round2 / 20,
                                brackets$sweet16 / 40,
                                brackets$elite8 / 80,
                                brackets$ff / 160,
                                brackets$finals / 320))

# Get the number of correctly predicted first round games
experts$firstround <- experts$round64 / 10
brackets$firstround <- brackets$round1 / 10

# Get total number of final four qualifiers
experts$finalfourqual <- experts$elite8 / 80
brackets$finalfourqual <- brackets$elite8 / 80


# Get the total number of points earned
experts$points <- rowSums(cbind(experts$round64,
                                experts$round32,
                                experts$sweet16,
                                experts$elite8,
                                experts$ff,
                                experts$finals))


# All chalk
sum(brackets$round1 == 240 &
      brackets$round2 == 200  &
      brackets$sweet16 == 160 &
      brackets$elite8 == 80 &
      brackets$ff1 == "florida" & 
      brackets$ff2 == "virginia" &
      brackets$ff3 == "arizona" &
      brackets$ff4 == "wichita state")

# All upsets bracket numbers
sum(brackets$round1 == 80 & brackets$round2 == 0)



# Generate spreadsheet of experts data
experts.print <- data.frame("Media Outlet" = experts$media.outlet,
                            "Expert" = experts$expert,
                            "Total Correct" = experts$games,
                            "Round 64 Correct" = experts$firstround,
                            "Total Points" = experts$points,
                            "Correct Final Four Qualifiers" = experts$finalfourqual)
write.csv(experts.print, 
          file = "experts table.csv",
          #col.names = c("Media Outlet", "Expert", "Total Correct", "Round 64 Correct",
          #              "Total Points", "Correct Final Four Qualifiers"),
          row.names = F)



##############################
# First look at the total number of games correctly predicted

games.test <- t.test(brackets$games, experts$games)
games.test$estimate 

games.test$estimate[1] - games.test$estimate[2]

games.test$statistic

(games.test$estimate[1] - games.test$estimate[2]) / games.test$statistic

games.test$p.value

games.test$conf.int

sum(brackets$games > max(experts$games))
sum(brackets$games > max(experts$games)) / nrow(brackets)


t.test(brackets$games[brackets$games > 30], experts$games)$p.value >= .05
# You have to get rid of all brackets with fewer than 30 games correct
mean(brackets$games <= 30)
# This is the bottom 9.74% of brackets




results.table <- data.frame(matrix(NA, ncol = 7, nrow = 4))
rownames(results.table) <- c("Total games", "Points", "Round of 32", "Final Four")
colnames(results.table) <- c("Expert mean", "Non-expert mean",
                             "Difference", "Std. Error of Diff.",
                             "p-value",
                             "Number better than best expert",
                             "Percent better than best expert")

results.table[1,] <- c(games.test$estimate[2],
                       games.test$estimate[1],
                       games.test$estimate[2] - games.test$estimate[1],
                       (games.test$estimate[1] - games.test$estimate[2]) / games.test$statistic,
                       games.test$p.value,
                       sum(brackets$games > max(experts$games)),
                       100 * sum(brackets$games > max(experts$games)) / nrow(brackets))



##############################
# Now look at the total number of points earned

points.test <- t.test(brackets$points, experts$points)
points.test$estimate 

points.test$estimate[1] - points.test$estimate[2]

points.test$statistic

(points.test$estimate[1] - points.test$estimate[2]) / points.test$statistic

points.test$p.value

points.test$conf.int

sum(brackets$points > max(experts$points))
sum(brackets$points > max(experts$points)) / nrow(brackets)

t.test(brackets$points[brackets$points > 440], experts$points)$p.value >= .05
mean(brackets$points <= 440)


results.table[2,] <- c(points.test$estimate[2],
                       points.test$estimate[1],
                       points.test$estimate[2] - points.test$estimate[1],
                       (points.test$estimate[1] - points.test$estimate[2]) / points.test$statistic,
                       points.test$p.value,
                       sum(brackets$points > max(experts$points)),
                       100 * sum(brackets$points > max(experts$points)) / nrow(brackets))







##############################
# Now look at the total number of firstround games correctly predicted

round1.test <- t.test(brackets$firstround, experts$firstround)
round1.test$estimate 

round1.test$estimate[1] - round1.test$estimate[2]

round1.test$statistic

(round1.test$estimate[1] - round1.test$estimate[2]) / round1.test$statistic

round1.test$p.value

round1.test$conf.int

sum(brackets$firstround > max(experts$firstround))
sum(brackets$firstround > max(experts$firstround)) / nrow(brackets)

t.test(brackets$firstround[brackets$firstround > 12], experts$firstround)$p.value >= .05
mean(brackets$firstround <= 12)

results.table[3,] <- c(round1.test$estimate[2],
                       round1.test$estimate[1],
                       round1.test$estimate[2] - round1.test$estimate[1],
                       (round1.test$estimate[1] - round1.test$estimate[2]) / round1.test$statistic,
                       round1.test$p.value,
                       sum(brackets$firstround > max(experts$firstround)),
                       100 * sum(brackets$firstround > max(experts$firstround)) / nrow(brackets))





##############################
# Now look at the total number of final four teams correctly predicted

ff.test <- t.test(brackets$finalfourqual, experts$finalfourqual)
ff.test$estimate 

ff.test$estimate[1] - ff.test$estimate[2]

ff.test$statistic

(ff.test$estimate[1] - ff.test$estimate[2]) / ff.test$statistic

ff.test$p.value

ff.test$conf.int

sum(brackets$finalfourqual > max(experts$finalfourqual))
sum(brackets$finalfourqual > max(experts$finalfourqual)) / nrow(brackets)


results.table[4,] <- c(ff.test$estimate[2],
                       ff.test$estimate[1],
                       ff.test$estimate[2] - ff.test$estimate[1],
                       (ff.test$estimate[1] - ff.test$estimate[2]) / ff.test$statistic,
                       ff.test$p.value,
                       sum(brackets$finalfourqual > max(experts$finalfourqual)),
                       100 * sum(brackets$finalfourqual > max(experts$finalfourqual)) / nrow(brackets))


# Don't have to get rid of any obs to have no diff between experts and non-experts


results.table[,c(1:5,7)] <- apply(results.table[,c(1:5,7)], 2, round, digits = 2)

write.csv(results.table, file = "results table.csv")




##########################################
## Graphs

# total number of games correctly predicted

png("games.png",
    height = 1440,
    width = 2160,
    res = 288)

hist(brackets$games,
     col = "Red",
     xlab = "Games correctly predicted",
     ylab = "Number of Tournament Challenge brackets",
     main = "Total number of games correctly predicted \n experts vs. everyone else",
     yaxt = "n")
dots <- table(experts$games)
for(i in 1:length(dots)){
  points(x = rep(names(dots)[i], dots[i]),
         y = c(1:dots[i]) * 100000,
         pch = 18,
         col = "blue",
         cex = 1.5)
}

axis(2, at = seq(0, 3000000, 500000),
     labels = c(0, "500k",paste0(seq(1,3,.5), "m")),
     las = 2)
graphics.off()



png("points.png",
    height = 1440,
    width = 2160,
    res = 288)
hist(brackets$points,
     col = "Red",
     xlab = "Total points",
     ylab = "Number of Tournament Challenge brackets",
     main = "Total points earned \n experts vs. everyone else",
     yaxt = "n")
dots <- table(experts$points)
for(i in 1:length(dots)){
  points(x = rep(names(dots)[i], dots[i]),
         y = c(1:dots[i]) * 100000,
         pch = 18,
         col = "blue",
         cex = 1)
}
axis(2, at = seq(0, 3000000, 500000),
     labels = c(0, "500k",paste0(seq(1,3,.5), "m")),
     las = 2)
graphics.off()



png("firstround.png",
    height = 1440,
    width = 2160,
    res = 288)
hist(brackets$firstround,
     col = "Red",
     xlab = "Games correctly predicted",
     ylab = "Number of Tournament Challenge brackets",
     main = "Round of 64 games correctly predicted \n experts vs. everyone else",
     yaxt = "n")
segments(x0 = jittered,
         x1 = jittered,
         y0 = 0,
         y1 = par()$usr[3],
         col = "blue")
axis(2, at = seq(0, 3000000, 500000),
     labels = c(0, "500k",paste0(seq(1,3,.5), "m")),
     las = 2)
graphics.off()